# Import necessary modules for data cleaning and data visualization
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
We will first load in the data files
# Load the 2015 DOT traffic-station metadata; the .txt file is comma-separated
dot_traffic_stations = pd.read_csv("dot_traffic_stations_2015.txt", sep=",")
dot_traffic_stations.head()
Get some info
# Column dtypes, non-null counts and memory usage of the stations table
dot_traffic_stations.info()
We will mainly focus on traffic data
# Load the main 2015 traffic-volume records and inspect the first rows and schema
dot_traffic = pd.read_csv("dot_traffic_2015.txt")
dot_traffic.head()
dot_traffic.info()
# Which columns contain at least one missing value?
dot_traffic.columns[dot_traffic.isna().sum() > 0]
# Cardinality of the candidate-for-removal columns (constant columns carry no signal)
dot_traffic['restrictions'].nunique()
dot_traffic['record_type'].nunique()
dot_traffic['year_of_data'].nunique()
Upon closer look at the data, I will remove the restrictions, record type and year of data columns as they are redundant. Other than that, the dataset looks great as the other columns do not have missing values.
# Dropped as redundant (see the nunique() checks above)
redundant_cols = ['restrictions', 'record_type', 'year_of_data']
dot_traffic = dot_traffic.drop(columns=redundant_cols)
As direction_of_travel is just the encoded value of direction_of_travel_name, direction_of_travel will be dropped. This is the same for functional classification
# The *_name columns already carry this information in readable form
encoded_duplicates = ['direction_of_travel', 'functional_classification']
dot_traffic = dot_traffic.drop(columns=encoded_duplicates)
We will convert date column to datetime datatype and a new feature called day of month
# Parse the date strings into datetime64 and derive a day-of-month feature
dot_traffic['date'] = pd.to_datetime(dot_traffic['date'])
dot_traffic['day_of_month'] = dot_traffic['date'].dt.day
dot_traffic.head()
To easily grab the 24 hourly-volume columns, I created a list containing just those column names.
# Collect the 24 hourly "traffic_volume_counted_*" column names once for reuse.
# A list comprehension replaces the manual loop-and-append.
traffic_hours_columns = [
    col for col in dot_traffic.columns if "traffic_volume_counted" in col
]
traffic_hours_columns
We will also generate daily total traffic volume by summing up all the traffic of all the hours of a day
# Daily total = row-wise sum of the 24 hourly volume columns
hourly_volumes = dot_traffic[traffic_hours_columns]
dot_traffic['total_daily_traffic_volume'] = hourly_volumes.sum(axis=1)
dot_traffic
Let's now find some patterns in the data using visualizations. We will mainly feature engineer the data with the groupby function in pandas, which allows us to group the data and perform computations. We will mainly use Plotly to plot interactive bar and line charts, as well as matplotlib and seaborn for heatmaps!
To visualize the traffic patterns, we will make use of the following data:
Let's first visualize the traffic data by hour and day of week
# Mean hourly volume per weekday — one line per hour-of-day column
weekly_hourly_means = (
    dot_traffic.groupby(['day_of_week'])[traffic_hours_columns]
    .mean()
    .reset_index()
)
day_of_week_hour_fig = px.line(
    weekly_hourly_means,
    x="day_of_week",
    y=traffic_hours_columns,
    title='Average traffic volume by day of week and hour',
)
day_of_week_hour_fig.show()
We can see that traffic volume generally increases sharply on Mondays across all hours, with a stable gradual rate of increase until Friday. This rise is in sharp contrast with that on Saturday and Sunday.
During the weekends, while "normal" hours experience a drastic decline in traffic flow, "less normal" hours such as 10-11am/11-12am and "odd" hours of 12-1am witness an abnormal increase in traffic.
These can be credited to the fact that during weekdays, traffic follows business hours (i.e. 9-5) where more people are out traveling, and during weekends, less people travel and they do so later than usual.
# Total traffic volume per hour-of-day column (comprehension instead of
# a manual loop-and-append)
total_traffic_by_hour = [dot_traffic[col].sum() for col in traffic_hours_columns]
# Build the two-column summary frame directly from a dict instead of zip()
hour_totals = pd.DataFrame(
    {"traffic_hours_columns": traffic_hours_columns, "Total": total_traffic_by_hour}
)
hour_fig = px.bar(hour_totals, y="traffic_hours_columns", x="Total",
                  title='Total traffic by hour')
hour_fig.show()
When hovering on the bars, we can see that traffic surges twice: between 7 and 8am in the morning, and 4 and 5pm in the afternoon. Traffic in the afternoon is higher than in the morning.
# Mean total daily volume for each day of the week
avg_by_weekday = (
    dot_traffic.groupby(['day_of_week'])['total_daily_traffic_volume']
    .mean()
    .reset_index()
)
day_of_week_fig = px.bar(
    avg_by_weekday,
    x="day_of_week",
    y="total_daily_traffic_volume",
    title='Average traffic by day of week',
)
day_of_week_fig.show()
We can see that total daily average traffic increases during the week and peaks on Friday, then decreases during the weekend.
# Mean total daily volume for each day of the month
avg_by_month_day = (
    dot_traffic.groupby(['day_of_month'])['total_daily_traffic_volume']
    .mean()
    .reset_index()
)
day_of_month_fig = px.bar(
    avg_by_month_day,
    x="day_of_month",
    y="total_daily_traffic_volume",
    title='Average traffic by day of month',
)
day_of_month_fig.show()
We can see that through the month, traffic remains similar. If we look closely, there is a small increase towards the end of the month, but the increase is almost negligible.
# Mean daily volume for every calendar date of 2015
daily_means = (
    dot_traffic.groupby(['date'])['total_daily_traffic_volume']
    .mean()
    .reset_index()
)
year_fig = px.line(
    daily_means,
    x="date",
    y='total_daily_traffic_volume',
    title='Traffic over the year',
)
year_fig.update_yaxes(automargin=True)
year_fig.show()
We can see that traffic tends to be higher in the middle of the year! Perhaps people like to travel in the summer!
# Per-date mean volume, plus day/month columns (constant within a date, so
# their mean is just the value) to use as pivot keys
traffic_by_day = dot_traffic.groupby(['date'])[["day_of_month","month_of_data",'total_daily_traffic_volume']].mean()
# Reshape into a month-of-year x day-of-month grid of mean daily volume;
# each (month, day) pair occurs once, so pivot() is safe here
traffic_by_day_pivot = traffic_by_day.pivot(columns='day_of_month', index='month_of_data', values='total_daily_traffic_volume')
fig, ax = plt.subplots(figsize=(35,23))
# fmt='g' keeps the cell annotations out of scientific notation
sns.heatmap(traffic_by_day_pivot, annot=True,cmap="coolwarm",linewidths=.05, fmt='g')
We can also view the heatmap of traffic by day and month. We can see the biggest dips in traffic during holidays such as 25/12 or 1/1.
Next we will visualize how Americans travel.
# Mean total daily volume per functional road classification
avg_by_functional = (
    dot_traffic.groupby(['functional_classification_name'])['total_daily_traffic_volume']
    .mean()
    .reset_index()
)
functional_fig = px.bar(
    avg_by_functional,
    x="functional_classification_name",
    y="total_daily_traffic_volume",
    title='Average traffic by functional name',
)
functional_fig.show()
The majority of traffic in the US is on Principal Arterials, mostly in urban areas.
We can also view the traffic throughout different day of week by different function names.
# Mean daily volume per (road class, weekday) pair
functional_by_day_of_week = (
    dot_traffic.groupby(['functional_classification_name', 'day_of_week'])['total_daily_traffic_volume']
    .mean()
    .reset_index()
)
# Heatmap: rows are weekdays, columns are road classes
weekday_class_grid = functional_by_day_of_week.pivot(
    columns='functional_classification_name',
    index='day_of_week',
    values='total_daily_traffic_volume',
)
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(weekday_class_grid, annot=True, linewidths=.05, fmt='g')
We can see that the Rural: Principal Arterial - Interstate has a big jump in traffic on Friday while others only increase slightly. Perhaps on Friday, a lot of people travel out to rural areas.
We might also be interested in the main directions of the traffic flow.
# Total daily volume per travel direction. The data is aggregated with
# .sum(), so the chart title must say "Total" — the original title claimed
# "Average", contradicting the aggregation.
direction_totals = (
    dot_traffic.groupby(['direction_of_travel_name'])['total_daily_traffic_volume']
    .sum()
    .reset_index()
)
direction_fig = px.bar(
    direction_totals,
    x="direction_of_travel_name",
    y="total_daily_traffic_volume",
    title='Total traffic by direction of travel',
)
direction_fig.show()
North-South seem to be the most common directions, followed closely by East-West.
The traffic data also includes the FIPS state code, so we can see which states have the most total traffic.
# Sum the daily volumes per FIPS state code
traffic_by_fips_state_code = (
    dot_traffic.groupby(['fips_state_code'])['total_daily_traffic_volume']
    .sum()
    .reset_index()
)
traffic_by_fips_state_code.head()
Our data doesn't include state names. But fret not, there is a package that contains US general data called us.
We can simply install it by !pip install us
import us
# Build a FIPS-code -> state-name lookup from the `us` package
fips_to_name = us.states.mapping("fips", "name")
# FIPS codes are two-digit strings; left-pad one-digit codes with "0"
traffic_by_fips_state_code.fips_state_code = traffic_by_fips_state_code.fips_state_code.astype(str).str.zfill(2)
traffic_by_fips_state_code["states"] = traffic_by_fips_state_code["fips_state_code"].map(fips_to_name)
traffic_by_fips_state_code.head()
A quick check showed that the state codes and state names are correct. Now let's see which state has the most traffic.
# Bar chart of total traffic per state, bars sorted from largest to smallest
state_fig = px.bar(
    traffic_by_fips_state_code,
    x="states",
    y="total_daily_traffic_volume",
    title='total traffic by state',
)
state_fig.update_layout(xaxis={'categoryorder': 'total descending'})
state_fig.show()
We can see that Virginia, California and Arizona have the highest total traffic!
The last piece of useful data is lane of travel. Let's find out what each lane of travel represents.
# Distinct lane-name labels captured by the stations file
dot_traffic_stations['lane_of_travel_name'].unique()
# Distinct numeric lane codes
dot_traffic_stations['lane_of_travel'].unique()
So the traffic stations only capture 3 lane names and 9 different lanes. Let's see what each lane name corresponds to.
# Inspect which numeric lane codes map to each lane-name label
lane_cols = ['lane_of_travel_name', 'lane_of_travel']
dot_traffic_stations[lane_cols][dot_traffic_stations['lane_of_travel_name'] == 'Other lanes'].head()
dot_traffic_stations[lane_cols][dot_traffic_stations['lane_of_travel_name'] == 'Outside (rightmost) lane'].head()
dot_traffic_stations[lane_cols][dot_traffic_stations['lane_of_travel_name'] == 'Data with lanes combined'].head()
So lane 0 means data with lanes combined. Lane 1 is outside (rightmost) lane. Lane 2-9 is 'others'
# Total daily volume per lane code, sorted descending
lane_totals = (
    dot_traffic.groupby(['lane_of_travel'])['total_daily_traffic_volume']
    .sum()
    .reset_index()
)
lane_fig = px.bar(
    lane_totals,
    x="lane_of_travel",
    y="total_daily_traffic_volume",
    title='total traffic by lane',
)
lane_fig.update_layout(xaxis={'categoryorder': 'total descending'})
lane_fig.show()
The majority of sensor data combines the lanes. Apart from that, we can now see that there is more traffic on the right side!
These were some interesting patterns that we can derive from the traffic dataset! The next part of this notebook will be modelling the data
We would like to model the traffic data using machine learning techniques. For this purpose, I will be using Pycaret. PyCaret is an open-source, low-code machine learning library and end-to-end model management tool built-in Python for automating machine learning workflows.
The installation instructions for Pycaret are at https://pycaret.org/install/. For my Jupyter Notebook, to prevent potential conflicts with other Python packages, I created a new Conda environment and installed Pycaret into it.
Just a quick recap, this is the traffic data throughout the whole year and we want our model to recognize the patterns from this
# Re-display the full-year traffic line chart built earlier
year_fig.show()
We will first feature engineer the dataset for our models. Date, day of month, month and day of week will be used as the features
# One row per date: calendar features plus the mean total daily volume
data = (
    dot_traffic.groupby(['date', "day_of_month", "month_of_data", "day_of_week"])[['total_daily_traffic_volume']]
    .mean()
    .reset_index()
)
data.head()
We will use 80% of the dataset for training; the remaining 20% is unseen data that will be used for validation.
# PyCaret's regression API (setup, compare_models, create_model, ...)
from pycaret.regression import *
# 80% sample for training; fixed seed keeps the split reproducible
train_test_data = data.sample(frac=0.8, random_state=123)
# The remaining 20% is held out entirely as unseen validation data
validation_data = data.drop(train_test_data.index)
We can initialize Pycaret by simply using setup() where we choose the data and target column!
# Initialize the PyCaret experiment: declares the target column, the explicit
# hold-out set, and a fixed seed for reproducibility
exp_reg101 = setup(data = train_test_data, test_data = validation_data, target = 'total_daily_traffic_volume', session_id = 123)
We will run compare_models() which will fit various machine learning models on the dataset and then sort the results based on R2 score
# Fit every available regressor and rank the results by R2
best = compare_models(sort = 'R2')
Huber Regressor seems to perform the best with R2 of around 0.7, Root Mean Squared Error of 544 is also not bad, the lowest of all models!
We will then create the model. When creating, we can also see the 10-fold cross validation results.
# Train a Huber regressor; PyCaret also prints the cross-validation scores
best_model = create_model('huber')
Finetuning of model is also just another quick function. This is done using randomized grid search in the backend, by default it will optimize the 'R2' score
# Hyperparameter tuning via randomized grid search (optimizes R2 by default)
best_model = tune_model(best_model)
This is our model with its parameters.
# Show the tuned estimator and its final hyperparameters
print(best_model)
We will then test the model on the unseen data
# Score the model on the hold-out split supplied to setup() as test_data
prediction_holdout = predict_model(best_model)
The RMSE and R2 are even better than on the training set! The model is neither overfitted nor underfitted and performs well on unseen data.
While R2 and RMSE look decent, it's highly important to look at the residual and prediction error plots.
Let's take a look at residuals plot and prediction error plot using the built-in plot_model function.
# Default plot type is the residuals plot
plot_model(best_model)
The residuals are pretty symmetrically distributed, tending to cluster towards the middle of the plot. This is a good model.
# Prediction-error plot (actual vs predicted with best-fit line)
plot_model(best_model, plot = 'error')
Apart from some "anomaly" values, the line of best fit seems to generalize well.
Let's visualize the predictions points on the actual traffic data plot we saw earlier.
# Get dataframe of predictions on the unseen validation split; PyCaret adds
# the predictions in a 'Label' column
predictions = predict_model(best_model, data=validation_data)
predictions.head()
# Concatenate validation-set predictions with the training rows so every
# date appears in the overlay plot below
predict_train_concat = pd.concat([predictions,train_test_data])
import plotly.graph_objs as go
# Actual daily traffic as a line chart...
fig12 = px.line(data, x="date", y=['total_daily_traffic_volume'], title='Traffic over the year', template = 'plotly_dark')
# ...with the model's predictions (the 'Label' column) overlaid as red markers
fig12.add_trace(go.Scatter(x=predict_train_concat.date.tolist(), y=predict_train_concat.Label.tolist(), mode = 'markers',
name = 'Predictions',
marker=dict(color='red',size=5)))
fig12.show()
Most predictions seem to be within the line plot.
We can make a scatter plot to visualize this even better.
# Actual values vs model predictions ('Label') for each validation date
fig13 = px.scatter(predictions, x="date", y=['total_daily_traffic_volume','Label'], title='Traffic over the year', template = 'plotly_dark')
fig13.show()
We can see that the predictions are closer to actual when they are on the "normal" days but not so much for some days when there are particularly more or less traffic.
From the plots and metrics, we can see that the model performs generally well, except for some anomalous data points which in reality might be real anomalies such as holidays, etc.
Considering that we use relatively few features, the model performs admirably well. With more features such as weather and holidays, the model should perform even better.
We can now save the model.
# Serialize the full pipeline + model to 'huber_regressor_us_traffic_2015.pkl'
save_model(best_model, 'huber_regressor_us_traffic_2015')
The model is saved as pkl file in the same directory. It can also be loaded next time using load_model().
# Reload the serialized pipeline to demonstrate round-tripping
saved_model = load_model('huber_regressor_us_traffic_2015')
We can perform further prediction with predict_model() by adding in our saved_model and the test dataframe
The last thing we can try is anomaly detection with traffic data.
We first need to import anomaly modules and setup the data.
# PyCaret's unsupervised anomaly-detection API
from pycaret.anomaly import *
# No target column: anomaly detection is unsupervised
s = setup(data, session_id = 123)
We can view a list of all anomaly models available.
# check list of available anomaly-detection models
models()
We can create an isolation forest model from the list above then we apply the model on the dataset. For this dataset, we'll set the fraction of anomaly data to be 5% of total data.
# Isolation Forest with the anomaly fraction made explicit. The surrounding
# text states a 5% contamination level; 0.05 is also PyCaret's default, so
# behavior is unchanged but the intent is now visible in the code.
iforest = create_model('iforest', fraction=0.05)
# assign_model appends 'Anomaly' (0/1) and 'Anomaly_Score' columns to the data
iforest_results = assign_model(iforest)
iforest_results.head()
We can take a quick look at the model results.
# Rows the model flagged as anomalous (Anomaly == 1)
iforest_results[iforest_results['Anomaly'] == 1].head()
We can plot the anomaly value on the traffic plot that we saw earlier.
# Base line chart of daily traffic
fig14 = px.line(
    iforest_results,
    x="date",
    y="total_daily_traffic_volume",
    title='Unsupervised anomaly detection daily traffic data',
)
# Rows the model flagged as anomalous (Anomaly == 1)
flagged = iforest_results[iforest_results['Anomaly'] == 1]
# create list of outlier_dates
outlier_dates = flagged.date.to_list()
# obtain y value of anomalies to plot
anomalies = flagged.total_daily_traffic_volume.to_list()
# Overlay the flagged points as red markers
fig14.add_trace(go.Scatter(x=outlier_dates, y=anomalies, mode='markers',
                           name='Anomaly',
                           marker=dict(color='red', size=8)))
fig14.show()
We can see that the models were able to pick up some anomaly dates such as 31/12 or 1/1 or 31/5 which are public holidays in US. Some other anomalies are days that see unusually low traffic such as 31/1 and 1/2 would also be worth looking at!
We can save this model for later use
# Persist the anomaly-detection model as a .pkl for later reuse
save_model(iforest, 'iforest_anomaly_detection_us_traffic')
We have reached the end of this notebook. So far we have taken a peek at the dataset, performed some feature engineering and then visualized the data to gain some insights. Lastly we tried various models to model the data and detect anomalies in it.
Thank you for reading!